curl https://owl.fish.washington.edu/halfshell/genomic-databank/GCF_031168955.1_ASM3116895v1_genomic.fna\
-k \
> ../data/GCF_031168955.1_ASM3116895v1_genomic.fnaExploring what fasta file
head -3 ../data/GCF_031168955.1_ASM3116895v1_genomic.fna## >NC_082382.1 Gadus macrocephalus chromosome 1, ASM3116895v1
## CAGACCTCCAATAGAGAGCTGCTCCCCCTTCGCACAAAGCCGCTGCTGGTGAATGCTCGAAGCGTTGTGTGATTGAATCG
## CTTTAATGCCGTTCCATGTCACGTTGATCGTTTTTTTGCACAACGAGCAAAAAGCTTCCCGGTCATTTCCCATCACGGGT
echo "How many sequences are there?"
grep -c ">" ../data/GCF_031168955.1_ASM3116895v1_genomic.fna## How many sequences are there?
## 24
We want sequences for every gene. Thus, we need to use the genome fasta and gff to extract fastas for every entry in the gtf.
/home/shared/bedtools2/bin/bedtools getfasta -fi ../data/GCF_031168955.1_ASM3116895v1_genomic.fna -bed ../data/GCF_031168955.1_ASM3116895v1.gff -fo ../data/Gmac_genes_fasta.fastaThis is from here picur reviewe sequences I named based on the identify of the database given
cd ../data
curl -O https://ftp.uniprot.org/pub/databases/uniprot/current_release/knowledgebase/complete/uniprot_sprot.fasta.gz
mv uniprot_sprot.fasta.gz uniprot_sprot_r2023_04.fasta.gz
gunzip -k uniprot_sprot_r2023_04.fasta.gz
curl -O https://gannet.fish.washington.edu/seashell/snaps/uniprot_table_r2023_01.tabmkdir ../blastdb
/home/shared/ncbi-blast-2.11.0+/bin/makeblastdb \
-in ../data/uniprot_sprot_r2023_04.fasta \
-dbtype prot \
-out ../blastdb/uniprot_sprot_r2023_04/home/shared/ncbi-blast-2.11.0+/bin/blastx \
-query ../data/Gmac_genes_fasta.fasta \
-db ../blastdb/uniprot_sprot_r2023_04 \
-out ../output/03.2-genome-annotation/Gmac.genesfasta-uniprot_blastx.tab \
-evalue 1E-20 \
-num_threads 20 \
-max_target_seqs 1 \
-outfmt 6 \
&> ../output/03.2-genome-annotation/Gmac.genesfasta-uniprot_blastx.loghead -2 ../output/03.2-genome-annotation/Gmac.genesfasta-uniprot_blastx.tab## NC_082382.1:0-26289739 sp|Q9NYQ7|CELR3_HUMAN 77.838 925 203 1 20384275 20381507 325 1249 0.0 1381
## NC_082382.1:0-26289739 sp|Q9NYQ7|CELR3_HUMAN 31.884 828 520 17 20383957 20381549 323 1131 1.84e-74 295
echo "Number of lines in output"
wc -l ../output/03.2-genome-annotation/Gmac.genesfasta-uniprot_blastx.tab## Number of lines in output
## 18650 ../output/03.2-genome-annotation/Gmac.genesfasta-uniprot_blastx.tab
tr '|' '\t' < ../output/03.2-genome-annotation/Gmac.genesfasta-uniprot_blastx.tab \
> ../output/03.2-genome-annotation/Gmac.genesfasta-uniprot_blastx_sep.tab
head -1 ../output/03.2-genome-annotation/Gmac.genesfasta-uniprot_blastx_sep.tab## NC_082382.1:0-26289739 sp Q9NYQ7 CELR3_HUMAN 77.838 925 203 1 20384275 20381507 325 1249 0.0 1381
bltabl <- read.csv("../output/03.2-genome-annotation/Gmac.genesfasta-uniprot_blastx_sep.tab", sep = '\t', header = FALSE)
spgo <- read.csv("../data/uniprot_table_r2023_01.tab", sep = '\t', header = TRUE)datatable(head(bltabl), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE))datatable(head(spgo), options = list(scrollX = TRUE, scrollY = "400px", scrollCollapse = TRUE, paging = FALSE))datatable(
left_join(bltabl, spgo, by = c("V3" = "Entry")) %>%
select(V1, V3, V13, Protein.names, Organism, Gene.Ontology..biological.process., Gene.Ontology.IDs)
# %>% mutate(V1 = str_replace_all(V1,pattern = "solid0078_20110412_FRAG_BC_WHITE_WHITE_F3_QV_SE_trimmed", replacement = "Ab"))
)